The first step is to load the necessary libraries:
library(readr) # Read data files
library(dplyr) # Manipulation tasks (filtering, selecting, summarizing)
library(tibble) # Ensures compatibility with tidyverse tools
library(ggplot2) # Used for plotting
library(DESeq2) # Perform DEA
library(BiocParallel) # Used for parallel computing
library(EDASeq) # Exploratory data analysis tools for RNA-Seq data
library(clusterProfiler) # Facilitates enrichment analysis
library(enrichplot) # Visualization tools for enrichment analysis
library(pheatmap) # Correlation matrix plot
library(ggfortify) # PCA autoplot
library(corrplot) # Plot correlation matrix
The packages to install come either from CRAN (The Comprehensive R Archive Network) or from Bioconductor.
The CRAN packages are straightforward and can be installed like this:
# Install CRAN packages
install.packages("readr")
Bioconductor packages are installed using the BiocManager package, which first we need to install:
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
Afterwards, it is possible to install the Bioconductor packages:
# Install Bioconductor packages
BiocManager::install("DESeq2")
First we load the necessary libraries and the .txt files we need to complete the exercises.
# Load the metadata
metadata <- read.delim("C:/Users/IhonaCorrea/Desktop/MASTER/COTM/Activity_2/sample_metadata.txt", header = TRUE, sep = "\t")
metadata
## FileName SampleName CellType Status
## 1 MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1 MCL1.DG luminal virgin
## 2 MCL1.DH_BC2CTUACXX_CAGATC_L002_R1 MCL1.DH basal virgin
## 3 MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1 MCL1.DI basal pregnant
## 4 MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1 MCL1.DJ basal pregnant
## 5 MCL1.DK_BC2CTUACXX_TTAGGC_L002_R1 MCL1.DK basal lactate
## 6 MCL1.DL_BC2CTUACXX_ATCACG_L002_R1 MCL1.DL basal lactate
## 7 MCL1.LA_BC2CTUACXX_GATCAG_L001_R1 MCL1.LA basal virgin
## 8 MCL1.LB_BC2CTUACXX_TGACCA_L001_R1 MCL1.LB luminal virgin
## 9 MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1 MCL1.LC luminal pregnant
## 10 MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1 MCL1.LD luminal pregnant
## 11 MCL1.LE_BC2CTUACXX_TAGCTT_L001_R1 MCL1.LE luminal lactate
## 12 MCL1.LF_BC2CTUACXX_CTTGTA_L001_R1 MCL1.LF luminal lactate
# Load the counts file
counts <- read.delim("C:/Users/IhonaCorrea/Desktop/MASTER/COTM/Activity_2/GSE60450_Lactation-GenewiseCounts.txt", header = TRUE, sep = "\t")
head(counts)
## EntrezGeneID Length MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1
## 1 497097 3634 438
## 2 100503874 3259 1
## 3 100038431 1634 0
## 4 19888 9747 1
## 5 20671 3130 106
## 6 27395 4203 309
## MCL1.DH_BC2CTUACXX_CAGATC_L002_R1 MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1
## 1 300 65
## 2 0 1
## 3 0 0
## 4 1 0
## 5 182 82
## 6 234 337
## MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1 MCL1.DK_BC2CTUACXX_TTAGGC_L002_R1
## 1 237 354
## 2 1 0
## 3 0 0
## 4 0 0
## 5 105 43
## 6 300 290
## MCL1.DL_BC2CTUACXX_ATCACG_L002_R1 MCL1.LA_BC2CTUACXX_GATCAG_L001_R1
## 1 287 0
## 2 4 0
## 3 0 0
## 4 0 10
## 5 82 16
## 6 270 560
## MCL1.LB_BC2CTUACXX_TGACCA_L001_R1 MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1
## 1 0 0
## 2 0 0
## 3 0 0
## 4 3 10
## 5 25 18
## 6 464 489
## MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1 MCL1.LE_BC2CTUACXX_TAGCTT_L001_R1
## 1 0 0
## 2 0 0
## 3 0 0
## 4 2 0
## 5 8 3
## 6 328 307
## MCL1.LF_BC2CTUACXX_CTTGTA_L001_R1
## 1 0
## 2 0
## 3 0
## 4 0
## 5 10
## 6 342
Let’s check if the metadata is aligned with the counts matrix:
counts_columns <- colnames(counts)[-c(1, 2)] # Exclude the first two columns that aren't samples
counts_columns
## [1] "MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1" "MCL1.DH_BC2CTUACXX_CAGATC_L002_R1"
## [3] "MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1" "MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1"
## [5] "MCL1.DK_BC2CTUACXX_TTAGGC_L002_R1" "MCL1.DL_BC2CTUACXX_ATCACG_L002_R1"
## [7] "MCL1.LA_BC2CTUACXX_GATCAG_L001_R1" "MCL1.LB_BC2CTUACXX_TGACCA_L001_R1"
## [9] "MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1" "MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1"
## [11] "MCL1.LE_BC2CTUACXX_TAGCTT_L001_R1" "MCL1.LF_BC2CTUACXX_CTTGTA_L001_R1"
Let’s confirm that the metadata is aligend with the counts matrix:
all(counts_columns == metadata$FileName) # Should return TRUE
## [1] TRUE
corMatrix <- cor(counts[-c(1, 2)], use="c")
# Keep only the sample name (first 7 digits) for better visualization
colnames(corMatrix) <- substr(colnames(corMatrix), 1, 7)
rownames(corMatrix) <- substr(rownames(corMatrix), 1, 7)
head(corMatrix)
## MCL1.DG MCL1.DH MCL1.DI MCL1.DJ MCL1.DK MCL1.DL MCL1.LA
## MCL1.DG 1.0000000 0.9771420 0.8418633 0.8204474 0.8108604 0.8575194 0.7324363
## MCL1.DH 0.9771420 1.0000000 0.8708108 0.8490237 0.8120959 0.8377299 0.7324962
## MCL1.DI 0.8418633 0.8708108 1.0000000 0.9909929 0.9479076 0.9317133 0.5684151
## MCL1.DJ 0.8204474 0.8490237 0.9909929 1.0000000 0.9662422 0.9446826 0.5290221
## MCL1.DK 0.8108604 0.8120959 0.9479076 0.9662422 1.0000000 0.9857808 0.4971564
## MCL1.DL 0.8575194 0.8377299 0.9317133 0.9446826 0.9857808 1.0000000 0.5346671
## MCL1.LB MCL1.LC MCL1.LD MCL1.LE MCL1.LF
## MCL1.DG 0.7377906 0.2367517 0.2135546 0.05760311 0.06287370
## MCL1.DH 0.7443741 0.2457856 0.2217865 0.05957578 0.06436077
## MCL1.DI 0.5822673 0.2794939 0.2609729 0.13185247 0.13542179
## MCL1.DJ 0.5415909 0.2331733 0.2163069 0.09410234 0.09737514
## MCL1.DK 0.5061827 0.2139352 0.2003178 0.09435293 0.09756887
## MCL1.DL 0.5428115 0.2202740 0.2066864 0.09735470 0.10133613
pheatmap(corMatrix)
Now we will build the same correlation matrix but differenciating
between the 3 status (virgin, pregnant and lactate)
metadata$FileName <- colnames(corMatrix)
# Ensure the row names of metadata match the column names of corMatrix
rownames(metadata) <- metadata$FileName # Set row names to the sample names
metadata <- metadata[, "Status", drop = FALSE] # Keep only the Status column
all(rownames(metadata) == colnames(corMatrix))
## [1] TRUE
library(pheatmap)
# Plot the correlation matrix with annotation
pheatmap(
corMatrix,
main = "Correlation Heatmap by Status",
annotation_col = metadata, # Add Status annotation
show_rownames = FALSE, # Hide row names for readability
fontsize_col = 8,
angle_col = 45
)
The following correlation plot contains the correlation coefficients as
white numbers:
corrplot(corMatrix, order = 'hclust',
addrect = 2, addCoef.col = 'white',
number.cex = 0.7)
It is possible to observe higher correlations among samples from the
same status.
Before obtaining the PCA, it is necessary to transpose the counts matrix.
transposed_matrix <- t(counts[-c(1, 2)]) # Transpose the matrix
transposed_matrix <- log2(transposed_matrix + 1) # Transform the counts to log2 scale
pcaResults <- prcomp(transposed_matrix) # Obtain PCA
#Plot PCA results
autoplot(pcaResults, data = metadata, colour = 'Status')+
ggtitle("PCA plot")+
theme(plot.title = element_text(hjust = 0.5))
summary(pcaResults)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 142.1211 74.4452 42.1418 39.72937 29.06842 20.17033
## Proportion of Variance 0.6323 0.1735 0.0556 0.04941 0.02645 0.01274
## Cumulative Proportion 0.6323 0.8058 0.8614 0.91086 0.93731 0.95005
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 19.01924 18.42876 17.60980 17.26812 16.90820 5.188e-13
## Proportion of Variance 0.01132 0.01063 0.00971 0.00934 0.00895 0.000e+00
## Cumulative Proportion 0.96137 0.97201 0.98171 0.99105 1.00000 1.000e+00
First of all we filter the metadata to keep only those samples that correspond to virgin and pregnant mice.
metadata <- read.delim("C:/Users/IhonaCorrea/Desktop/MASTER/COTM/Activity_2/sample_metadata.txt", header = TRUE, sep = "\t")
subSamplInfo <- metadata[metadata$Status %in% c("virgin", "pregnant"), ]
We get the file names from the sub sampled metadata and we use them to obtain the columns from the counts matrix that match with that row names. By doing this, we are filtering the counts matrix to keep only samples from virgin and pregnant mice too.
# Get the column names from counts excluding the first two columns
counts_columns <- colnames(counts)[-c(1, 2)]
# Get the filenames from subSamplInfo
subsampl_files <- subSamplInfo$FileName
# Find the intersection of counts columns and filenames
matching_columns <- intersect(counts_columns, subsampl_files)
subCOUNTS <- counts[, colnames(counts) %in% matching_columns] # Not include the first 2 columns
rownames(subCOUNTS) <- counts$EntrezGeneID # Set EntrezGeneID as row names
head(subCOUNTS, n=3)
## MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1 MCL1.DH_BC2CTUACXX_CAGATC_L002_R1
## 497097 438 300
## 100503874 1 0
## 100038431 0 0
## MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1 MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1
## 497097 65 237
## 100503874 1 1
## 100038431 0 0
## MCL1.LA_BC2CTUACXX_GATCAG_L001_R1 MCL1.LB_BC2CTUACXX_TGACCA_L001_R1
## 497097 0 0
## 100503874 0 0
## 100038431 0 0
## MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1 MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1
## 497097 0 0
## 100503874 0 0
## 100038431 0 0
Before constructing the DESeqDataset object, wee need to ensure that the row names from the metadata (subsampl_files) match and are in the same order as the column names from subCOUNTS ( the filtered counts matrix).
all(colnames(subCOUNTS) == subsampl_files) # has to be TRUE
## [1] TRUE
Also, we ensure that the Status, which is the column differenciating between virgin and pregnant mice, is treated as a factor.
subSamplInfo$Status <- as.factor(subSamplInfo$Status)
# construct object
dds <- DESeqDataSetFromMatrix(
countData = subCOUNTS,
colData = subSamplInfo,
design = ~ Status
)
print(dds)
## class: DESeqDataSet
## dim: 27179 8
## metadata(1): version
## assays(1): counts
## rownames(27179): 497097 100503874 ... 100861691 100504472
## rowData names(0):
## colnames(8): MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1
## MCL1.DH_BC2CTUACXX_CAGATC_L002_R1 ...
## MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1 MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1
## colData names(4): FileName SampleName CellType Status
Certain functions can be used to access this information separately: rownames(dds), colnames(dds), counts(dds) and colData(dds).
colData(dds) # Example on how to access the information of the dds object
## DataFrame with 8 rows and 4 columns
## FileName SampleName
## <character> <character>
## MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1 MCL1.DG_BC2CTUACXX_A.. MCL1.DG
## MCL1.DH_BC2CTUACXX_CAGATC_L002_R1 MCL1.DH_BC2CTUACXX_C.. MCL1.DH
## MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1 MCL1.DI_BC2CTUACXX_A.. MCL1.DI
## MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1 MCL1.DJ_BC2CTUACXX_C.. MCL1.DJ
## MCL1.LA_BC2CTUACXX_GATCAG_L001_R1 MCL1.LA_BC2CTUACXX_G.. MCL1.LA
## MCL1.LB_BC2CTUACXX_TGACCA_L001_R1 MCL1.LB_BC2CTUACXX_T.. MCL1.LB
## MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1 MCL1.LC_BC2CTUACXX_G.. MCL1.LC
## MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1 MCL1.LD_BC2CTUACXX_G.. MCL1.LD
## CellType Status
## <character> <factor>
## MCL1.DG_BC2CTUACXX_ACTTGA_L002_R1 luminal virgin
## MCL1.DH_BC2CTUACXX_CAGATC_L002_R1 basal virgin
## MCL1.DI_BC2CTUACXX_ACAGTG_L002_R1 basal pregnant
## MCL1.DJ_BC2CTUACXX_CGATGT_L002_R1 basal pregnant
## MCL1.LA_BC2CTUACXX_GATCAG_L001_R1 basal virgin
## MCL1.LB_BC2CTUACXX_TGACCA_L001_R1 luminal virgin
## MCL1.LC_BC2CTUACXX_GCCAAT_L001_R1 luminal pregnant
## MCL1.LD_BC2CTUACXX_GGCTAC_L001_R1 luminal pregnant
Next, we will remove genes that have almost no information in any sample.
dds <- dds[ rowSums(DESeq2::counts(dds)) > 3, ]
# We set the factor level to virgin mice
dds$Status <- relevel(dds$Status , ref = "virgin")
library(BiocParallel)
param <- MulticoreParam()
register(param)
dds <- DESeq(dds, parallel = T)
DEresults<- results(dds)
DEresults <- DEresults[order(DEresults$pvalue),] #sort results by increasing p-value
DEresults
## log2 fold change (MLE): Status pregnant vs virgin
## Wald test p-value: Status pregnant vs virgin
## DataFrame with 19220 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## 77097 3064.078 -1.980240 0.1532796 -12.9191 3.51065e-38 5.95968e-34
## 14758 1554.425 1.464270 0.1248113 11.7319 8.74983e-32 7.42686e-28
## 20656 2783.784 0.802968 0.0687741 11.6754 1.70172e-31 9.62949e-28
## 101214 3224.373 -0.736428 0.0693145 -10.6244 2.29373e-26 9.73460e-23
## 72157 944.784 0.765255 0.0813538 9.4065 5.12917e-21 1.74146e-17
## ... ... ... ... ... ... ...
## 246133 19.94624 -3.75980 2.44800 -1.53586 NA NA
## 13184 4.26773 -4.63749 3.39175 -1.36729 NA NA
## 76507 21.67145 -2.00418 1.10862 -1.80782 NA NA
## 23993 11.32434 -1.74899 1.68251 -1.03951 NA NA
## 19259 26.89530 -7.32684 2.77443 -2.64084 NA NA
This MA plot is useful to observe if the data normalization worked well. Most points are expected to be on the horizontal 0 line, since most genes are not expected to be differentially expressed.
# MA plot
DESeq2::plotMA(object = dds, ylim = c(-5, 5), main = "MA plot virgin-pregnant")
For the p-value distribution, we expect to see a peak around low
p-values and a uniform distribution at P-values above 0.1.
# P-value distribution
ggplot(data = as.data.frame(DEresults), aes(x = pvalue)) +
geom_histogram(bins = 100)+
ggtitle("p-value distribution virgin-pregnant") +
theme(plot.title = element_text(hjust = 0.5))
# PCA plot
rld <- rlog(dds)
DESeq2::plotPCA(rld, ntop = 500, intgroup = 'Status') +
ylim(-50, 50) + theme_bw()+
ggtitle("PCA plot virgin-pregnant")+
theme(plot.title = element_text(hjust = 0.5))
## using ntop=500 top features by variance
### 6. Visualization
library(org.Mm.eg.db) # Genome wide annotation for mice
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
##
# Save the DEG results in a dataframe
DEG.vir.preg <- data.frame(DEresults$log2FoldChange, DEresults$lfcSE, DEresults$pvalue,
DEresults$padj, rownames(DEresults)) %>% `colnames<-`(c('log2FoldChange', 'lfcSE', 'pvalue', 'padj', 'EntrezGeneID'))
# Map the EntrezGeneID column to a reference mice database
DEG.vir.preg$GenSymbl<-mapIds(org.Mm.eg.db,keys=DEG.vir.preg$EntrezGeneID,column="SYMBOL",
keytype="ENTREZID", multiVals="first")
## 'select()' returned 1:1 mapping between keys and columns
head(DEG.vir.preg)
## log2FoldChange lfcSE pvalue padj EntrezGeneID GenSymbl
## 1 -1.9802400 0.15327961 3.510653e-38 5.959684e-34 77097 Tanc2
## 2 1.4642702 0.12481127 8.749830e-32 7.426856e-28 14758 Gpm6b
## 3 0.8029683 0.06877411 1.701724e-31 9.629491e-28 20656 Sod2
## 4 -0.7364281 0.06931450 2.293733e-26 9.734604e-23 101214 Tra2a
## 5 0.7652548 0.08135380 5.129172e-21 1.741456e-17 72157 Pgm1
## 6 2.1195380 0.24194413 1.944831e-18 5.502576e-15 433022 Plcxd2
The significantly differentially expressed genes are the ones found in the upper-left and upper-right corners. We are going to add a column to the data frame (named diffexpressed) to specify if they are UP- or DOWN- regulated (which depends if log2FoldChange is positive or negative).
# Set the thesholds for being up or down regulated
F2CLim <- 2
PadjLim <- 0.05
A log2 fold change of 2 corresponds to a 4-fold increase in gene expression, which can be considered a substantial change that reflects meaningful biological shifts. Moreover, a p-value of 0.05 is a common accepted significance level. If the changes are below the 0.05 threshold, it means they are statistically significant and not due to random chance.
DEG.vir.preg$diffexpressed <- "NO" # First we categorized all genes into not differencially expressed
# if log2Foldchange > F2CLim and Padj < PadjLim, set as "UP"
DEG.vir.preg$diffexpressed[DEG.vir.preg$log2FoldChange > F2CLim & DEG.vir.preg$padj < PadjLim ] <- "UP"
# if log2Foldchange < -F2CLim and Padj < PadjLim, set as "DOWN"
DEG.vir.preg$diffexpressed[DEG.vir.preg$log2FoldChange < -F2CLim & DEG.vir.preg$padj < PadjLim] <- "DOWN"
Now we perform a Volcano plot and color-code it based on if the genes are up or down regulated:
p <- ggplot(data=DEG.vir.preg, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed)) +
geom_point() + theme_minimal()
p2 <- p + geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
geom_hline(yintercept=-log10(PadjLim), col="red")
## Change point color
p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))
mycolors <- c("blue", "red", "black")
names(mycolors) <- c("DOWN", "UP", "NO")
p3 <- p2 + scale_colour_manual(values = mycolors)+
ggtitle("Volcano plot virgin-pregnant") +
theme(plot.title = element_text(hjust = 0.5))
p3
Next, the same Volcano plot but with the names of the genes next to each point will be assessed:
# Create a new column "delabel" that contains the name of genes differentially expressed (NA in case they are not)
DEG.vir.preg$delabel <- NA
DEG.vir.preg$delabel[DEG.vir.preg$diffexpressed != "NO"] <- DEG.vir.preg$GenSymbl[DEG.vir.preg$diffexpressed != "NO"]
#Organize the labels using the "ggrepel" package and the geom_text_repel() function
library(ggrepel)
ggplot(data=DEG.vir.preg, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue", "black", "red")) +
geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
geom_hline(yintercept=-log10(PadjLim), col="red")+
ggtitle("Volcano plot virgin-pregnant") +
theme(plot.title = element_text(hjust = 0.5))
We save the results:
UPR.vir.preg <- DEG.vir.preg[DEG.vir.preg$diffexpressed=="UP",]
DWR.vir.preg <- DEG.vir.preg[DEG.vir.preg$diffexpressed=="DOWN",]
head(UPR.vir.preg, n=3)
## log2FoldChange lfcSE pvalue padj EntrezGeneID GenSymbl
## 6 2.119538 0.2419441 1.944831e-18 5.502576e-15 433022 Plcxd2
## 16 2.010375 0.2673337 5.473635e-14 5.807526e-11 170706 Tmem37
## 20 7.700666 1.0743842 7.637067e-13 6.482342e-10 14077 Fabp3
## diffexpressed delabel
## 6 UP Plcxd2
## 16 UP Tmem37
## 20 UP Fabp3
First of all we retrieve the mouse genes from the reference library and we map them to our EntrezGeneID to ensure the IDs used in downstream analysis are valid and consistent with the genome annotation database.
# Define the universe genes for enrichment analysis
df <- as.data.frame(org.Mm.egGO)
go_gene_list <- unique(sort(df$gene_id)) # Mouse universe genes
head(df)
## gene_id go_id Evidence Ontology
## 1 11287 GO:0007566 IGI BP
## 2 11287 GO:0010466 IEA BP
## 3 11298 GO:0006474 ISO BP
## 4 11298 GO:0007623 IBA BP
## 5 11298 GO:0007623 ISO BP
## 6 11298 GO:0009416 IBA BP
# Obtain ENTREZ ID of the genes previously extracted
ent.Gene <- mapIds(
org.Mm.eg.db,
keys = DWR.vir.preg$EntrezGeneID,
keytype = "ENTREZID",
column = "ENTREZID",
multiVals = "first"
)
ego <- enrichGO(gene = na.omit(ent.Gene),
universe = go_gene_list,
OrgDb = org.Mm.eg.db,
ont = "all",
pAdjustMethod = "BH",
pvalueCutoff = 0.10,
qvalueCutoff = 0.05,
readable = TRUE)
head(ego)# Reference
## ONTOLOGY ID Description
## GO:0050767 BP GO:0050767 regulation of neurogenesis
## GO:0014009 BP GO:0014009 glial cell proliferation
## GO:0014013 BP GO:0014013 regulation of gliogenesis
## GO:0014015 BP GO:0014015 positive regulation of gliogenesis
## GO:0030856 BP GO:0030856 regulation of epithelial cell differentiation
## GO:1903034 BP GO:1903034 regulation of response to wounding
## GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue
## GO:0050767 19/184 494/28905 0.03846154 6.042015 9.047164 4.719877e-10
## GO:0014009 9/184 79/28905 0.11392405 17.896602 12.036738 1.978310e-09
## GO:0014013 10/184 143/28905 0.06993007 10.985482 9.581105 2.938305e-08
## GO:0014015 8/184 93/28905 0.08602151 13.513324 9.674211 1.486042e-07
## GO:0030856 10/184 173/28905 0.05780347 9.080485 8.532291 1.772410e-07
## GO:1903034 10/184 176/28905 0.05681818 8.925704 8.441547 2.080162e-07
## p.adjust qvalue
## GO:0050767 1.110587e-06 9.285736e-07
## GO:0014009 2.327482e-06 1.946033e-06
## GO:0014013 2.304611e-05 1.926910e-05
## GO:0014015 8.157702e-05 6.820742e-05
## GO:0030856 8.157702e-05 6.820742e-05
## GO:1903034 8.157702e-05 6.820742e-05
## geneID
## GO:0050767 Abcc8/Map1b/Id1/Cxcr4/Il33/Dpysl5/Hey2/Ptprz1/Tnfrsf1b/Cysltr1/Ntn1/Fgfr3/Hey1/Sema3b/Dscam/Wnt7b/Plag1/Trpc6/Ptn
## GO:0014009 Abcc8/Il33/Areg/Cysltr1/Ntn1/Lepr/Cntnap2/Plag1/Ptn
## GO:0014013 Abcc8/Cxcr4/Il33/Ptprz1/Tnfrsf1b/Cysltr1/Ntn1/Fgfr3/Plag1/Ptn
## GO:0014015 Cxcr4/Il33/Ptprz1/Tnfrsf1b/Cysltr1/Ntn1/Plag1/Ptn
## GO:0030856 Id1/Dmbt1/Prom1/Hey2/Eppk1/Fgfr3/Hey1/Cited1/Reg3g/Proc
## GO:1903034 Tspan8/Vwf/Fgb/Cxcr4/Il33/Eppk1/Mmrn1/Reg3g/Proc/Ptn
## Count
## GO:0050767 19
## GO:0014009 9
## GO:0014013 10
## GO:0014015 8
## GO:0030856 10
## GO:1903034 10
## Remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)
head(summary(ego2), n=3)
## Warning in summary(ego2): summary method to convert the object to data.frame is
## deprecated, please use as.data.frame instead.
## ONTOLOGY ID Description
## GO:0050767 BP GO:0050767 regulation of neurogenesis
## GO:0014009 BP GO:0014009 glial cell proliferation
## GO:0030856 BP GO:0030856 regulation of epithelial cell differentiation
## GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue
## GO:0050767 19/184 494/28905 0.03846154 6.042015 9.047164 4.719877e-10
## GO:0014009 9/184 79/28905 0.11392405 17.896602 12.036738 1.978310e-09
## GO:0030856 10/184 173/28905 0.05780347 9.080485 8.532291 1.772410e-07
## p.adjust qvalue
## GO:0050767 1.110587e-06 9.285736e-07
## GO:0014009 2.327482e-06 1.946033e-06
## GO:0030856 8.157702e-05 6.820742e-05
## geneID
## GO:0050767 Abcc8/Map1b/Id1/Cxcr4/Il33/Dpysl5/Hey2/Ptprz1/Tnfrsf1b/Cysltr1/Ntn1/Fgfr3/Hey1/Sema3b/Dscam/Wnt7b/Plag1/Trpc6/Ptn
## GO:0014009 Abcc8/Il33/Areg/Cysltr1/Ntn1/Lepr/Cntnap2/Plag1/Ptn
## GO:0030856 Id1/Dmbt1/Prom1/Hey2/Eppk1/Fgfr3/Hey1/Cited1/Reg3g/Proc
## Count
## GO:0050767 19
## GO:0014009 9
## GO:0030856 10
ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)
# Dotplot with improved readability
dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5
facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
ggtitle("Dotplot for DOWN-regulated genes") +
theme(
panel.spacing = unit(1, "lines"), # Space between facets
axis.text.y = element_text( hjust = 1, size = 7),
plot.title = element_text(hjust = 0.5) # Put title in the center
)
cnetplot(ego2, showCategory = 3, foldChange = DWR.vir.preg$log2FoldChange, circular = TRUE, colorEdge = TRUE)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
## The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
## The colorEdge parameter will be removed in the next version.
p <- upsetplot(ego2, n = 5)
p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) +
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+
ggtitle("Intersection of Top 5 GO Terms for DOWN-regulated") +
theme(plot.title = element_text(hjust = 0.5))
edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for DOWN-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)
p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
## The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot for DOWN-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)
# Obtain ENTREZ ID of the genes previously extracted
ent.Gene <- mapIds(
org.Mm.eg.db,
keys = UPR.vir.preg$EntrezGeneID, # Specify up-regulated
keytype = "ENTREZID",
column = "ENTREZID",
multiVals = "first"
)
ego <- enrichGO(gene = na.omit(ent.Gene),
universe = go_gene_list,
OrgDb = org.Mm.eg.db,
ont = "all",
pAdjustMethod = "BH",
pvalueCutoff = 0.10,
qvalueCutoff = 0.05,
readable = TRUE)
head(ego)# Reference
## ONTOLOGY ID Description
## GO:0015718 BP GO:0015718 monocarboxylic acid transport
## GO:0015711 BP GO:0015711 organic anion transport
## GO:0046942 BP GO:0046942 carboxylic acid transport
## GO:0015849 BP GO:0015849 organic acid transport
## GO:0015909 BP GO:0015909 long-chain fatty acid transport
## GO:1905954 BP GO:1905954 positive regulation of lipid localization
## GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue
## GO:0015718 10/96 182/28905 0.05494505 16.543613 12.142923 5.396754e-10
## GO:0015711 12/96 451/28905 0.02660754 8.011364 8.663026 3.443028e-08
## GO:0046942 11/96 374/28905 0.02941176 8.855699 8.826990 4.853190e-08
## GO:0015849 11/96 376/28905 0.02925532 8.808594 8.797798 5.122677e-08
## GO:0015909 5/96 68/28905 0.07352941 22.139246 10.074405 3.213174e-06
## GO:1905954 6/96 126/28905 0.04761905 14.337798 8.661279 4.094145e-06
## p.adjust qvalue
## GO:0015718 9.066547e-07 7.231651e-07
## GO:0015711 2.151524e-05 1.716097e-05
## GO:0046942 2.151524e-05 1.716097e-05
## GO:0015849 2.151524e-05 1.716097e-05
## GO:0015909 1.079626e-03 8.611306e-04
## GO:1905954 1.146361e-03 9.143590e-04
## geneID
## GO:0015718 Fabp3/Slc26a7/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36
## GO:0015711 Fabp3/Slc26a7/Slc38a3/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36/Slc52a2
## GO:0046942 Fabp3/Slc26a7/Slc38a3/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36
## GO:0015849 Fabp3/Slc26a7/Slc38a3/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36
## GO:0015909 Fabp3/Drd4/Acsl1/Plin2/Cd36
## GO:1905954 Fabp3/Lpl/Dab2/Acsl1/Plin2/Cd36
## Count
## GO:0015718 10
## GO:0015711 12
## GO:0046942 11
## GO:0015849 11
## GO:0015909 5
## GO:1905954 6
## remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)
head(summary(ego2), n=2)
## Warning in summary(ego2): summary method to convert the object to data.frame is
## deprecated, please use as.data.frame instead.
## ONTOLOGY ID Description GeneRatio
## GO:0015718 BP GO:0015718 monocarboxylic acid transport 10/96
## GO:0015849 BP GO:0015849 organic acid transport 11/96
## BgRatio RichFactor FoldEnrichment zScore pvalue
## GO:0015718 182/28905 0.05494505 16.543613 12.142923 5.396754e-10
## GO:0015849 376/28905 0.02925532 8.808594 8.797798 5.122677e-08
## p.adjust qvalue
## GO:0015718 9.066547e-07 7.231651e-07
## GO:0015849 2.151524e-05 1.716097e-05
## geneID
## GO:0015718 Fabp3/Slc26a7/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36
## GO:0015849 Fabp3/Slc26a7/Slc38a3/Drd4/Acsl1/Slc16a12/Slc16a9/Abcb11/Abcg2/Plin2/Cd36
## Count
## GO:0015718 10
## GO:0015849 11
ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)
# Dotplot with improved readability
dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5
facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
ggtitle("Dotplot for UP-regulated genes") +
theme(
panel.spacing = unit(1, "lines"), # Space between facets
axis.text.y = element_text( hjust = 1, size = 7), # Rotate y-axis labels
plot.title = element_text(hjust = 0.5) # Put title in the center
)
cnetplot(ego2, showCategory = 3, foldChange = UPR.vir.preg$log2FoldChange, circular = TRUE, colorEdge = TRUE, max.overlaps = 100)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
## The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
## The colorEdge parameter will be removed in the next version.
p <- upsetplot(ego2, n = 5)
p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) +
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+
ggtitle("Intersection of Top 5 GO Terms for UP-regulated genes") +
theme(plot.title = element_text(hjust = 0.5))
edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for UP-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)
p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
## The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot for UP-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)
metadata <- read.delim("sample_metadata.txt", header = TRUE, sep = "\t") # Load metadata again
subSamplInfo <- metadata[metadata$Status %in% c("virgin", "lactate"), ] # Filter to virgin and lactating mice
# Get the column names from counts excluding the first two columns
counts_columns <- colnames(counts)[-c(1, 2)]
# Get the filenames from subSamplInfo
subsampl_files <- subSamplInfo$FileName
# Find the intersection of counts columns and filenames
matching_columns <- intersect(counts_columns, subsampl_files)
subCOUNTS <- counts[, colnames(counts) %in% matching_columns] # Not include the first 2 columns
rownames(subCOUNTS) <- counts$EntrezGeneID # Set EntrezGeneID as row names
See if the rows of the metadata are aligned with the columns of the counts matrix:
all(colnames(subCOUNTS) == subsampl_files) # has to be TRUE
## [1] TRUE
subSamplInfo$Status <- as.factor(subSamplInfo$Status) # Status treated as a factor
# construct a DESeqDataset object
dds <- DESeqDataSetFromMatrix(
countData = subCOUNTS,
colData = subSamplInfo,
design = ~ Status
)
# Pre-filtering
dds <- dds[ rowSums(DESeq2::counts(dds)) > 3, ]
# We set the factor level to virgin mice
dds$Status <- relevel(dds$Status , ref = "virgin")
# Run DESeq
library(BiocParallel)
param <- MulticoreParam()
register(param)
dds <- DESeq(dds, parallel = T)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 1 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 1 workers
DEresults<- results(dds)
#sort results by increasing p-value
DEresults <- DEresults[order(DEresults$pvalue),]
DEresults
## log2 fold change (MLE): Status lactate vs virgin
## Wald test p-value: Status lactate vs virgin
## DataFrame with 19125 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## 59095 499.919 -3.74548 0.286908 -13.0546 5.98222e-39
## 67111 395.788 2.23399 0.171645 13.0152 1.00335e-38
## 170459 483.021 2.52916 0.200609 12.6074 1.92263e-36
## 77697 236.771 1.63310 0.148234 11.0170 3.16256e-28
## 93842 338.688 -2.43362 0.222782 -10.9238 8.87127e-28
## ... ... ... ... ... ...
## 68867 3158.769336 1.06118e-04 0.909386 1.16692e-04 0.999907
## 268354 0.785158 2.18907e-04 2.644676 8.27729e-05 0.999934
## 666202 1.850182 6.53647e-05 1.083369 6.03347e-05 0.999952
## 76507 15.145366 -4.71345e+00 1.464829 -3.21774e+00 NA
## 19259 23.718074 -4.44241e+00 2.481774 -1.79001e+00 NA
## padj
## <numeric>
## 59095 8.47729e-35
## 67111 8.47729e-35
## 170459 1.08295e-32
## 77697 1.33602e-24
## 93842 2.99813e-24
## ... ...
## 68867 0.999907
## 268354 NA
## 666202 NA
## 76507 NA
## 19259 NA
Let’s plot the diagnostic plots explained before:
# MA plot
DESeq2::plotMA(object = dds, ylim = c(-5, 5), main = "MA plot virgin-lactate")
# P-value distribution
ggplot(data = as.data.frame(DEresults), aes(x = pvalue)) +
geom_histogram(bins = 100)+
ggtitle("p-vale distribution virgin-lactate") +
theme(plot.title = element_text(hjust = 0.5))
rld <- rlog(dds)
DESeq2::plotPCA(rld, ntop = 500, intgroup = 'Status') +
ylim(-50, 50) + theme_bw()+
ggtitle("PCA plot virgin-lactate")
## using ntop=500 top features by variance
theme(plot.title = element_text(hjust = 0.5))
## List of 1
## $ plot.title:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0.5
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
library(org.Mm.eg.db) # Genome wide annotation for mice
DEG.vir.lact <- data.frame(DEresults$log2FoldChange, DEresults$lfcSE, DEresults$pvalue,
DEresults$padj, rownames(DEresults)) %>% `colnames<-`(c('log2FoldChange', 'lfcSE', 'pvalue', 'padj', 'EntrezGeneID'))
DEG.vir.lact$GenSymbl<-mapIds(org.Mm.eg.db,keys=DEG.vir.lact$EntrezGeneID,column="SYMBOL",
keytype="ENTREZID", multiVals="first")
## 'select()' returned 1:1 mapping between keys and columns
head(DEG.vir.lact)
## log2FoldChange lfcSE pvalue padj EntrezGeneID GenSymbl
## 1 -3.7454754 0.28690844 5.982219e-39 8.477291e-35 59095 Fxyd6
## 2 2.2339923 0.17164543 1.003348e-38 8.477291e-35 67111 Naaa
## 3 2.5291587 0.20060927 1.922630e-36 1.082953e-32 170459 Stard4
## 4 1.6330980 0.14823370 3.162558e-28 1.336023e-24 77697 Mmab
## 5 -2.4336216 0.22278175 8.871268e-28 2.998134e-24 93842 Igsf9
## 6 0.9775204 0.09243579 3.886413e-26 9.978270e-23 74252 Armc1
DEG.vir.lact$diffexpressed <- "NO"
# if log2Foldchange > F2CLim and Padj < PadjLim, set as "UP"
DEG.vir.lact$diffexpressed[DEG.vir.lact$log2FoldChange > F2CLim & DEG.vir.lact$padj < PadjLim ] <- "UP"
# if log2Foldchange < -F2CLim and Padj < PadjLim, set as "DOWN"
DEG.vir.lact$diffexpressed[DEG.vir.lact$log2FoldChange < -F2CLim & DEG.vir.lact$padj < PadjLim] <- "DOWN"
p <- ggplot(data=DEG.vir.preg, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed)) +
geom_point() + theme_minimal()
p2 <- p + geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
geom_hline(yintercept=-log10(PadjLim), col="red")
## Change point color
p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))
mycolors <- c("blue", "red", "black")
names(mycolors) <- c("DOWN", "UP", "NO")
p3 <- p2 + scale_colour_manual(values = mycolors)+
ggtitle("Volcano plot virgin-lactate") +
theme(plot.title = element_text(hjust = 0.5))
p3
## Warning: Removed 2244 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Plot with the name of genes beside the points
# Create a new column "delabel"
DEG.vir.lact$delabel <- NA
DEG.vir.lact$delabel[DEG.vir.lact$diffexpressed != "NO"] <- DEG.vir.lact$GenSymbl[DEG.vir.lact$diffexpressed != "NO"]
# Organize the labels using the "ggrepel" package and geom_text_repel() function
library(ggrepel)
ggplot(data=DEG.vir.lact, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue", "black", "red")) +
geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
geom_hline(yintercept=-log10(PadjLim), col="red") +
ggtitle("Volcano plot virgin-lactate") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 2227 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 18456 rows containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 629 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Save the results:
UPR.vir.lact <- DEG.vir.lact[DEG.vir.lact$diffexpressed=="UP",]
DWR.vir.lact <- DEG.vir.lact[DEG.vir.lact$diffexpressed=="DOWN",]
# Define the universe genes for enrichment analysis
df <- as.data.frame(org.Mm.egGO)
go_gene_list <- unique(sort(df$gene_id)) # Mouse universe genes
# Obtain ENTREZ ID of the genes previously extracted
ent.Gene <- mapIds(
org.Mm.eg.db,
keys = DWR.vir.lact$EntrezGeneID, # Down regulated Virgin-lactating
keytype = "ENTREZID",
column = "ENTREZID",
multiVals = "first"
)
ego <- enrichGO(gene = na.omit(ent.Gene),
universe = go_gene_list,
OrgDb = org.Mm.eg.db,
ont = "all",
pAdjustMethod = "BH",
pvalueCutoff = 0.10,
qvalueCutoff = 0.05,
readable = TRUE)
head(ego)# Reference
## ONTOLOGY ID Description GeneRatio
## GO:0050767 BP GO:0050767 regulation of neurogenesis 24/308
## GO:0003205 BP GO:0003205 cardiac chamber development 12/308
## GO:0072001 BP GO:0072001 renal system development 16/308
## GO:0015850 BP GO:0015850 organic hydroxy compound transport 15/308
## GO:0010038 BP GO:0010038 response to metal ion 15/308
## GO:0046879 BP GO:0046879 hormone secretion 17/308
## BgRatio RichFactor FoldEnrichment zScore pvalue
## GO:0050767 494/28905 0.04858300 4.559388 8.281131 8.577351e-10
## GO:0003205 200/28905 0.06000000 5.630844 6.820096 1.778030e-06
## GO:0072001 359/28905 0.04456825 4.182614 6.297275 1.827804e-06
## GO:0015850 317/28905 0.04731861 4.440729 6.392670 1.849071e-06
## GO:0010038 318/28905 0.04716981 4.426764 6.376870 1.922227e-06
## GO:0046879 405/28905 0.04197531 3.939274 6.182145 1.963552e-06
## p.adjust qvalue
## GO:0050767 2.701008e-06 2.110931e-06
## GO:0003205 9.242665e-04 7.223463e-04
## GO:0072001 9.242665e-04 7.223463e-04
## GO:0015850 9.242665e-04 7.223463e-04
## GO:0010038 9.242665e-04 7.223463e-04
## GO:0046879 9.242665e-04 7.223463e-04
## geneID
## GO:0050767 Cit/Bmp2/Id1/Tnfrsf1b/Il33/Abcc8/Dpysl5/Heyl/Wnt7b/Trpc6/Rassf10/Cysltr1/Hey1/Hey2/Sema3b/Fgfr3/Myb/Bhlhe41/Fgf13/Cxcr4/Bex1/Lpar3/Ptn/Dscam
## GO:0003205 Stra6/Bmp2/Cpe/Heyl/Smad7/Hey1/Gata6/Hey2/Tnnt2/Cxcr4/Bmp5/Rbp4
## GO:0072001 Stra6/Bmp2/Id3/Heyl/Wnt7b/Lgr5/Smad7/Hey1/Tacstd2/Gcnt4/Upk3a/Hc/Prom1/Cxcr4/Cfh/Rbp4
## GO:0015850 Stra6/Slc22a3/Stra6l/Kcnb1/Slco1a5/Nat8l/Lgals3/C1qtnf1/Abcc3/Myb/Slco1a4/Syt15/Syt8/Syt5/Rbp4
## GO:0010038 Alox15/Abcc8/Cybrd1/Kcnb1/Crip1/Adcy7/Fgb/Tnnt2/Hamp/Chp2/Slc30a3/Fgg/Syt15/Syt8/Syt5
## GO:0046879 F2rl1/Il11/Lepr/Tnfsf11/Abcc8/Cpe/Kcnb1/C1qtnf1/Kcnj11/Fgb/Il1rn/Myb/Ildr2/Trh/Ffar4/Fgg/Rbp4
## Count
## GO:0050767 24
## GO:0003205 12
## GO:0072001 16
## GO:0015850 15
## GO:0010038 15
## GO:0046879 17
## remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)
head(summary(ego2), n=2)
## Warning in summary(ego2): summary method to convert the object to data.frame is
## deprecated, please use as.data.frame instead.
## ONTOLOGY ID Description GeneRatio BgRatio
## GO:0050767 BP GO:0050767 regulation of neurogenesis 24/308 494/28905
## GO:0003205 BP GO:0003205 cardiac chamber development 12/308 200/28905
## RichFactor FoldEnrichment zScore pvalue p.adjust
## GO:0050767 0.048583 4.559388 8.281131 8.577351e-10 2.701008e-06
## GO:0003205 0.060000 5.630844 6.820096 1.778030e-06 9.242665e-04
## qvalue
## GO:0050767 2.110931e-06
## GO:0003205 7.223463e-04
## geneID
## GO:0050767 Cit/Bmp2/Id1/Tnfrsf1b/Il33/Abcc8/Dpysl5/Heyl/Wnt7b/Trpc6/Rassf10/Cysltr1/Hey1/Hey2/Sema3b/Fgfr3/Myb/Bhlhe41/Fgf13/Cxcr4/Bex1/Lpar3/Ptn/Dscam
## GO:0003205 Stra6/Bmp2/Cpe/Heyl/Smad7/Hey1/Gata6/Hey2/Tnnt2/Cxcr4/Bmp5/Rbp4
## Count
## GO:0050767 24
## GO:0003205 12
Dot plot:
ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)
# Dotplot with improved readability
dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5
facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
ggtitle("Dotplot for DOWN-regulated genes") +
theme(
panel.spacing = unit(1, "lines"), # Space between facets
axis.text.y = element_text( hjust = 1, size = 7), # Rotate y-axis labels
plot.title = element_text(hjust = 0.5) # Put title in the center
)
Gene concept network:
cnetplot(ego2, showCategory = 3, foldChange = DWR.vir.lact$log2FoldChange, circular = TRUE, colorEdge = TRUE)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
## The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
## The colorEdge parameter will be removed in the next version.
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p <- upsetplot(ego2, n = 5)
p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) +
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+
ggtitle("Intersection of Top 5 GO Terms for DOWN-regulated genes") +
theme(plot.title = element_text(hjust = 0.5))
Tree plot:
edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for DOWN-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)
p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
## The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot of Enriched Terms for DOWN-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)
# Obtain ENTREZ ID of the genes previously extracted
ent.Gene <- mapIds(
org.Mm.eg.db,
keys = UPR.vir.lact$EntrezGeneID,
keytype = "ENTREZID",
column = "ENTREZID",
multiVals = "first"
)
ego <- enrichGO(gene = na.omit(ent.Gene),
universe = go_gene_list,
OrgDb = org.Mm.eg.db,
ont = "all",
pAdjustMethod = "BH",
pvalueCutoff = 0.10,
qvalueCutoff = 0.05,
readable = TRUE)
head(ego)# Reference
## ONTOLOGY ID Description
## GO:0006631 BP GO:0006631 fatty acid metabolic process
## GO:0006633 BP GO:0006633 fatty acid biosynthetic process
## GO:0072330 BP GO:0072330 monocarboxylic acid biosynthetic process
## GO:0046394 BP GO:0046394 carboxylic acid biosynthetic process
## GO:0016053 BP GO:0016053 organic acid biosynthetic process
## GO:0006084 BP GO:0006084 acetyl-CoA metabolic process
## GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue
## GO:0006631 32/331 465/28905 0.06881720 6.009551 11.72109 5.956564e-16
## GO:0006633 20/331 162/28905 0.12345679 10.781023 13.43641 3.653093e-15
## GO:0072330 22/331 228/28905 0.09649123 8.426220 12.11646 2.730764e-14
## GO:0046394 24/331 325/28905 0.07384615 6.448710 10.63192 6.549035e-13
## GO:0016053 24/331 328/28905 0.07317073 6.389728 10.56581 7.981248e-13
## GO:0006084 10/331 36/28905 0.27777778 24.257301 15.02803 6.610351e-12
## p.adjust qvalue
## GO:0006631 1.829857e-12 1.522999e-12
## GO:0006633 5.611151e-12 4.670191e-12
## GO:0072330 2.796302e-11 2.327377e-11
## GO:0046394 4.903679e-10 4.081358e-10
## GO:0016053 4.903679e-10 4.081358e-10
## GO:0006084 3.384500e-09 2.816937e-09
## geneID
## GO:0006631 Naaa/Gstm7/Fabp3/Lpl/Abcb11/Olah/Insig1/Acsl1/Acacb/Acss2/Scd1/Fads1/Elovl5/Fasn/Scd4/Elovl6/Scd2/Acaca/Ucp3/Pla2g15/Ehhadh/Acly/Acat2/Prkaa2/Ptgs1/Pla2g4f/Acat3/Ppargc1a/Elovl1/Scd3/Acot6/Lpin1
## GO:0006633 Gstm7/Lpl/Olah/Insig1/Acacb/Acss2/Scd1/Fads1/Elovl5/Fasn/Scd4/Elovl6/Scd2/Acaca/Acly/Prkaa2/Ptgs1/Pla2g4f/Elovl1/Scd3
## GO:0072330 Stard4/Gstm7/Lpl/Olah/Insig1/Acacb/Acss2/Scd1/Fads1/Elovl5/Fasn/Scd4/Elovl6/Scd2/Acaca/Hoga1/Acly/Prkaa2/Ptgs1/Pla2g4f/Elovl1/Scd3
## GO:0046394 Stard4/Gstm7/Lpl/Olah/Insig1/Acacb/Acss2/Scd1/Fads1/Elovl5/Fasn/Scd4/Elovl6/Scd2/Acaca/Aass/Hoga1/Acly/Prkaa2/Ptgs1/Aldh18a1/Pla2g4f/Elovl1/Scd3
## GO:0016053 Stard4/Gstm7/Lpl/Olah/Insig1/Acacb/Acss2/Scd1/Fads1/Elovl5/Fasn/Scd4/Elovl6/Scd2/Acaca/Aass/Hoga1/Acly/Prkaa2/Ptgs1/Aldh18a1/Pla2g4f/Elovl1/Scd3
## GO:0006084 Acacb/Acss2/Pmvk/Fasn/Acaca/Mpc1/Dlat/Acly/Mpc2/Pdhb
## Count
## GO:0006631 32
## GO:0006633 20
## GO:0072330 22
## GO:0046394 24
## GO:0016053 24
## GO:0006084 10
## remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)
Dot plot:
ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)
# Dotplot with improved readability
dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5
facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
ggtitle("Dotplot for UP-regulated genes") +
theme(
panel.spacing = unit(1, "lines"), # Space between facets
axis.text.y = element_text( hjust = 1, size = 7),
plot.title = element_text(hjust = 0.5) # Put title in the center
)
Gene concept network:
cnetplot(ego2, showCategory = 3, foldChange = UPR.vir.lact$log2FoldChange, circular = TRUE, colorEdge = TRUE)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
## The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
## The colorEdge parameter will be removed in the next version.
## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
UpSet plot:
p <- upsetplot(ego2, n = 5)
p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) +
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+
ggtitle("Intersection of Top 5 GO Terms for UP-regulated genes") +
theme(plot.title = element_text(hjust = 0.5))
Tree plot:
edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for UP-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)
p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
## The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot of Enriched Terms for UP-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)
metadata <- read.delim("sample_metadata.txt", header = TRUE, sep = "\t") # Load again the metadata
subSamplInfo <- metadata[metadata$Status %in% c("pregnant", "lactate"), ] # Filter for pregnant and lactate mice
# Get the column names from counts excluding the first two columns
counts_columns <- colnames(counts)[-c(1, 2)]
# Get the filenames from subSamplInfo
subsampl_files <- subSamplInfo$FileName
# Find the intersection of counts columns and filenames
matching_columns <- intersect(counts_columns, subsampl_files)
subCOUNTS <- counts[, colnames(counts) %in% matching_columns] # Not include the first 2 columns
rownames(subCOUNTS) <- counts$EntrezGeneID # Set EntrezGeneID as row names
See if the rows of the metadata are aligned with the columns of the counts matrix:
all(colnames(subCOUNTS) == subsampl_files) # has to be TRUE
## [1] TRUE
subSamplInfo$Status <- as.factor(subSamplInfo$Status) # Status treated as a factor
# construct a DESeqDataset object
dds <- DESeqDataSetFromMatrix(
countData = subCOUNTS,
colData = subSamplInfo,
design = ~ Status
)
# Pre-filtering
dds <- dds[ rowSums(DESeq2::counts(dds)) > 3, ]
# We set the factor level to pregnant mice in this case
dds$Status <- relevel(dds$Status , ref = "pregnant")
# Run DESeq
library(BiocParallel)
param <- MulticoreParam()
register(param)
dds <- DESeq(dds, parallel = T)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates: 1 workers
## mean-dispersion relationship
## final dispersion estimates, fitting model and testing: 1 workers
DEresults<- results(dds)
#sort results by increasing p-value
DEresults <- DEresults[order(DEresults$pvalue),]
DEresults
## log2 fold change (MLE): Status lactate vs pregnant
## Wald test p-value: Status lactate vs pregnant
## DataFrame with 18869 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## 67111 342.865 2.586276 0.2067976 12.50631 6.89518e-36 1.12426e-31
## 66234 773.961 2.231038 0.1921582 11.61042 3.64816e-31 2.97416e-27
## 69237 1319.425 -1.008621 0.0903078 -11.16870 5.80210e-29 3.15344e-25
## 67952 2047.203 -0.613893 0.0695262 -8.82966 1.04998e-18 4.27999e-15
## 67619 585.578 -1.101617 0.1284619 -8.57544 9.87098e-18 3.21893e-14
## ... ... ... ... ... ... ...
## 219150 395.1892 6.51393e-05 0.190371 0.000342170 0.999727 0.999736
## 17876 131.0867 -1.25242e-04 0.378766 -0.000330658 0.999736 0.999736
## 215866 114.1349 -4.85535e+00 1.910235 -2.541756729 NA NA
## 18948 16.6089 -6.53002e+00 2.260910 -2.888224910 NA NA
## 21943 60.0785 -6.95556e+00 2.167159 -3.209530294 NA NA
Let’s plot the diagnostic plots:
# MA plot
DESeq2::plotMA(object = dds, ylim = c(-5, 5), main = "MA plot pregnant-lactate")
# P-value distribution
ggplot(data = as.data.frame(DEresults), aes(x = pvalue)) +
geom_histogram(bins = 100)+
ggtitle("p-vale distribution pregnant-lactate") +
theme(plot.title = element_text(hjust = 0.5))
rld <- rlog(dds)
DESeq2::plotPCA(rld, ntop = 500, intgroup = 'Status') +
ylim(-50, 50) + theme_bw()+
ggtitle("PCA plot pregnant-lactate")
## using ntop=500 top features by variance
theme(plot.title = element_text(hjust = 0.5))
## List of 1
## $ plot.title:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0.5
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
Visualization:
library(org.Mm.eg.db) # Genome wide annotation for mice
DEG.preg.lact <- data.frame(DEresults$log2FoldChange, DEresults$lfcSE, DEresults$pvalue,
DEresults$padj, rownames(DEresults)) %>% `colnames<-`(c('log2FoldChange', 'lfcSE', 'pvalue', 'padj', 'EntrezGeneID'))
DEG.preg.lact$GenSymbl<-mapIds(org.Mm.eg.db,keys=DEG.preg.lact$EntrezGeneID,column="SYMBOL",
keytype="ENTREZID", multiVals="first") #
## 'select()' returned 1:1 mapping between keys and columns
head(DEG.preg.lact)
## log2FoldChange lfcSE pvalue padj EntrezGeneID GenSymbl
## 1 2.5862756 0.20679762 6.895182e-36 1.124259e-31 67111 Naaa
## 2 2.2310379 0.19215823 3.648158e-31 2.974161e-27 66234 Msmo1
## 3 -1.0086211 0.09030779 5.802102e-29 3.153442e-25 69237 Gtpbp4
## 4 -0.6138928 0.06952625 1.049982e-18 4.279991e-15 67952 Tomm20
## 5 -1.1016168 0.12846186 9.870978e-18 3.218926e-14 67619 Nob1
## 6 0.9283026 0.10902737 1.674554e-17 4.550601e-14 68728 Trp53inp2
Volcano plot:
DEG.preg.lact$diffexpressed <- "NO"
# if log2Foldchange > F2CLim and Padj < PadjLim, set as "UP"
DEG.preg.lact$diffexpressed[DEG.preg.lact$log2FoldChange > F2CLim & DEG.preg.lact$padj < PadjLim ] <- "UP"
# if log2Foldchange < -F2CLim and Padj < PadjLim, set as "DOWN"
DEG.preg.lact$diffexpressed[DEG.preg.lact$log2FoldChange < -F2CLim & DEG.preg.lact$padj < PadjLim] <- "DOWN"
p <- ggplot(data=DEG.preg.lact, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed)) +
geom_point() + theme_minimal()
p2 <- p + geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
geom_hline(yintercept=-log10(PadjLim), col="red")
## Change point color
p3 <- p2 + scale_color_manual(values=c("blue", "black", "red"))
mycolors <- c("blue", "red", "black")
names(mycolors) <- c("DOWN", "UP", "NO")
p3 <- p2 + scale_colour_manual(values = mycolors)+
ggtitle("Volcano plot pregnant-lactate") +
theme(plot.title = element_text(hjust = 0.5))
p3
# Write down the name of genes beside the points
# Create a new column "delabel"
DEG.preg.lact$delabel <- NA
DEG.preg.lact$delabel[DEG.preg.lact$diffexpressed != "NO"] <- DEG.preg.lact$GenSymbl[DEG.preg.lact$diffexpressed != "NO"]
# Organize the labels nicely using "ggrepel" package and geom_text_repel() function
library(ggrepel)
ggplot(data=DEG.preg.lact, aes(x=log2FoldChange, y=-log10(padj), col=diffexpressed, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue", "black", "red")) +
geom_vline(xintercept=c(-F2CLim, F2CLim), col="red") +
geom_hline(yintercept=-log10(PadjLim), col="red") +
ggtitle("Volcano plot pregnant-lactate") +
theme(plot.title = element_text(hjust = 0.5))
Save the results:
UPR.preg.lact <- DEG.preg.lact[DEG.preg.lact$diffexpressed=="UP",]
DWR.preg.lact <- DEG.preg.lact[DEG.preg.lact$diffexpressed=="DOWN",]
# Define the universe genes for enrichment analysis
df <- as.data.frame(org.Mm.egGO)
go_gene_list <- unique(sort(df$gene_id)) # Mouse universe genes
# Obtain ENTREZ ID of the genes previously extracted
ent.Gene <- mapIds(
org.Mm.eg.db,
keys = DWR.preg.lact$EntrezGeneID, #down-regulated pregnant-lactate
keytype = "ENTREZID",
column = "ENTREZID",
multiVals = "first"
)
ego <- enrichGO(gene = na.omit(ent.Gene),
universe = go_gene_list,
OrgDb = org.Mm.eg.db,
ont = "all",
pAdjustMethod = "BH",
pvalueCutoff = 0.10,
qvalueCutoff = 0.05,
readable = TRUE)
head(ego)# Reference
## ONTOLOGY ID
## GO:0050727 BP GO:0050727
## GO:0042531 BP GO:0042531
## GO:0048145 BP GO:0048145
## GO:0030225 BP GO:0030225
## GO:0045649 BP GO:0045649
## GO:0042509 BP GO:0042509
## Description
## GO:0050727 regulation of inflammatory response
## GO:0042531 positive regulation of tyrosine phosphorylation of STAT protein
## GO:0048145 regulation of fibroblast proliferation
## GO:0030225 macrophage differentiation
## GO:0045649 regulation of macrophage differentiation
## GO:0042509 regulation of tyrosine phosphorylation of STAT protein
## GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue
## GO:0050727 8/68 411/28905 0.01946472 8.273937 7.212273 5.477737e-06
## GO:0042531 4/68 56/28905 0.07142857 30.362395 10.680178 9.381058e-06
## GO:0048145 5/68 127/28905 0.03937008 16.735178 8.629818 1.263380e-05
## GO:0030225 4/68 62/28905 0.06451613 27.424099 10.114262 1.409768e-05
## GO:0045649 3/68 22/28905 0.13636364 57.964572 12.979364 1.857185e-05
## GO:0042509 4/68 71/28905 0.05633803 23.947804 9.401048 2.416721e-05
## p.adjust qvalue
## GO:0050727 0.004737862 0.003448715
## GO:0042531 0.004737862 0.003448715
## GO:0048145 0.004737862 0.003448715
## GO:0030225 0.004737862 0.003448715
## GO:0045649 0.004737862 0.003448715
## GO:0042509 0.004737862 0.003448715
## geneID Count
## GO:0050727 Ctss/Alox15/Serpine1/Smpdl3b/Csf1r/Osm/Pik3ap1/Dnase1l3 8
## GO:0042531 Lif/Csf1r/Osm/Il24 4
## GO:0048145 Serpine1/Lif/Cd74/Rasgrf1/Ptprv 5
## GO:0030225 Csf1/C1qc/Lif/Csf1r 4
## GO:0045649 Csf1/C1qc/Lif 3
## GO:0042509 Lif/Csf1r/Osm/Il24 4
## remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)
Dot plot:
ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)
# Dotplot with improved readability
dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5
facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
ggtitle("Dotplot for DOWN-regulated genes") +
theme(
panel.spacing = unit(1, "lines"), # Space between facets
axis.text.y = element_text( hjust = 1, size = 7), # Rotate y-axis labels
plot.title = element_text(hjust = 0.5) # Put title in the center
)
Gene concept network:
cnetplot(ego2, showCategory = 3, foldChange = DWR.preg.lact$log2FoldChange, circular = TRUE, colorEdge = TRUE)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
## The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
## The colorEdge parameter will be removed in the next version.
UpSet plot:
p <- upsetplot(ego2, n = 5)
p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) +
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+
ggtitle("Intersection of Top 5 GO Terms for DOWN-regulated") +
theme(plot.title = element_text(hjust = 0.5))
Tree plot:
edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for DOWN-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)
p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
## The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot of Enriched Terms for DOWN-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)
# Obtain ENTREZ ID of the genes previously extracted
ent.Gene <- mapIds(
org.Mm.eg.db,
keys = UPR.preg.lact$EntrezGeneID, #UP-regulated pregnant-lactate
keytype = "ENTREZID",
column = "ENTREZID",
multiVals = "first"
)
ego <- enrichGO(gene = na.omit(ent.Gene),
universe = go_gene_list,
OrgDb = org.Mm.eg.db,
ont = "all",
pAdjustMethod = "BH",
pvalueCutoff = 0.10,
qvalueCutoff = 0.05,
readable = TRUE)
head(ego)# Reference
## ONTOLOGY ID Description
## GO:0006695 BP GO:0006695 cholesterol biosynthetic process
## GO:1902653 BP GO:1902653 secondary alcohol biosynthetic process
## GO:0016126 BP GO:0016126 sterol biosynthetic process
## GO:0006633 BP GO:0006633 fatty acid biosynthetic process
## GO:0046165 BP GO:0046165 alcohol biosynthetic process
## GO:0072330 BP GO:0072330 monocarboxylic acid biosynthetic process
## GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue
## GO:0006695 6/74 60/28905 0.10000000 39.06081 14.951452 1.027241e-08
## GO:1902653 6/74 60/28905 0.10000000 39.06081 14.951452 1.027241e-08
## GO:0016126 6/74 66/28905 0.09090909 35.50983 14.219674 1.841865e-08
## GO:0006633 7/74 162/28905 0.04320988 16.87813 10.267288 2.002341e-07
## GO:0046165 6/74 143/28905 0.04195804 16.38915 9.346266 1.853513e-06
## GO:0072330 7/74 228/28905 0.03070175 11.99235 8.442212 1.990582e-06
## p.adjust qvalue
## GO:0006695 6.980103e-06 5.401126e-06
## GO:1902653 6.980103e-06 5.401126e-06
## GO:0016126 8.343648e-06 6.456221e-06
## GO:0006633 6.802952e-05 5.264048e-05
## GO:0046165 4.508669e-04 3.488757e-04
## GO:0072330 4.508669e-04 3.488757e-04
## geneID Count
## GO:0006695 Msmo1/Fdps/Insig1/Lss/Pmvk/Tm7sf2 6
## GO:1902653 Msmo1/Fdps/Insig1/Lss/Pmvk/Tm7sf2 6
## GO:0016126 Msmo1/Fdps/Insig1/Lss/Pmvk/Tm7sf2 6
## GO:0006633 Gstm7/Insig1/Scd2/Elovl6/Acacb/Ptgs1/Elovl5 7
## GO:0046165 Msmo1/Fdps/Insig1/Lss/Pmvk/Tm7sf2 6
## GO:0072330 Gstm7/Insig1/Scd2/Elovl6/Acacb/Ptgs1/Elovl5 7
## remove redundent GO terms
ego2 <- clusterProfiler::simplify(ego)
Dot plot:
ego@result$Description <- stringr::str_wrap(ego@result$Description, width = 100)
# Dotplot with improved readability
dotplot(ego, split = "ONTOLOGY", showCategory = 5) + # Create a dot plot of enriched terms from the ego object, displaying the top 5
facet_grid(ONTOLOGY ~ ., scales = "free") + # splits the plot into facets for each ontology
ggtitle("Dotplot for UP-regulated genes") +
theme(
panel.spacing = unit(1, "lines"), # Space between facets
axis.text.y = element_text( hjust = 1, size = 7),
plot.title = element_text(hjust = 0.5) # Put title in the center
)
Gene concept network:
cnetplot(ego2, showCategory = 3, foldChange = UPR.preg.lact$log2FoldChange, circular = TRUE, colorEdge = TRUE)
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(foldChange = your_value)' instead of 'foldChange'.
## The foldChange parameter will be removed in the next version.
## Warning in cnetplot.enrichResult(x, ...): Use 'color.params = list(edge = your_value)' instead of 'colorEdge'.
## The colorEdge parameter will be removed in the next version.
p <- upsetplot(ego2, n = 5)
p + scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, 2)) +
theme_minimal() + labs(y = "# Genes", x = "Intersect GOterms")+
ggtitle("Intersection of Top GO Terms for UP-regulated genes") +
theme(plot.title = element_text(hjust = 0.5))
Tree plot
edox <- pairwise_termsim(ego)
p1 <- treeplot(edox)
p1 + ggtitle("Treeplot of Enriched Terms for UP-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)
p2 <- treeplot(edox, hclust_method = "average")
## Warning in treeplot.enrichResult(x, ...): Use 'cluster.params = list(method = your_value)' instead of 'hclust_method'.
## The hclust_method parameter will be removed in the next version.
p2 + ggtitle("Clustered Treeplot of Enriched Terms for UP-regulated genes") +
theme(
plot.title = element_text(hjust = 0.5)
)